library(rtweet)
library(httpuv)
library(tidyverse)
library(tidytext)
library(wordcloud2)
library(qdapRegex)
library(tm)
library(webshot)
library(htmlwidgets)
library(ggplot2)
library(sf)
library(tmap)
# Pull tweets with #CancelStudentDebt; returns 1000 most recent tweets; time by GMT
# Query the REST search API: 1000 most recent English-language tweets
# carrying the hashtag, with retweets and replies excluded
student_debt_tweets <- search_tweets(
  q = "#CancelStudentDebt",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Pull tweets with #CancelStudentDebt AND capitalism
# Space-separated terms are ANDed by the search API, so this returns tweets
# containing BOTH the hashtag and the word "capitalism"
student_debt_capitalism_tweets <- search_tweets(
  q = "#CancelStudentDebt capitalism",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Instead of pulling from the API, you could also pull tweets with #CancelStudentDebt, and then query the text
# of these tweets locally using a stringr function
# Same result without a second API call: filter the already-downloaded tweets
# for "capitalism"/"Capitalism" in the tweet text
student_debt_capitalism_tweets_ALT <-
  student_debt_tweets %>%
  filter(grepl("[Cc]apitalism", text))
# Pull tweets with #CancelStudentDebt OR capitalism
# The explicit OR operator widens the query: tweets containing EITHER
# the hashtag or the word "capitalism"
student_debt_OR_capitalism_tweets <- search_tweets(
  q = "#CancelStudentDebt OR capitalism",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Pull tweets from an account (doesn't have same time constraints)
# Pull last 3200 BLM tweets (note sometimes the query will return less than 3200 due to deletions)
blm_tweets <- get_timeline("@Blklivesmatter", n = 3200)
# Ten most-favorited tweets, trimmed to the columns of interest;
# single pipeline instead of two separate reassignments
blm_tweets_most_favorited <- blm_tweets %>%
  slice_max(favorite_count, n = 10) %>%
  select(created_at, screen_name, text, favorite_count)
print(blm_tweets_most_favorited)
## # A tibble: 10 x 4
## created_at screen_name text favorite_count
## <dttm> <chr> <chr> <dbl>
## 1 2020-08-29 03:31:41 Blklivesmat… "Beyond painful. Rest in Pow… 53597
## 2 2021-01-06 19:42:10 Blklivesmat… "So we all just gonna act li… 46103
## 3 2020-06-15 15:34:41 Blklivesmat… "Until today, you could be f… 44755
## 4 2020-05-31 03:21:51 Blklivesmat… "We call for an END to syste… 33327
## 5 2020-05-29 21:39:43 Blklivesmat… "Rest in Power, Beautiful. @… 32572
## 6 2020-05-26 17:38:48 Blklivesmat… "His name was George Floyd. … 31084
## 7 2020-06-09 21:24:28 Blklivesmat… "You have changed us forever… 29776
## 8 2020-06-05 14:04:00 Blklivesmat… "Happy 27th birthday, Breonn… 26147
## 9 2020-10-11 00:22:13 Blklivesmat… "#BlackLivesMatter rises wit… 23265
## 10 2020-06-03 11:59:10 Blklivesmat… "When people take to the str… 22675
# Same idea for retweets: top ten by retweet_count
blm_tweets_most_retweeted <-
  blm_tweets %>%
  slice_max(retweet_count, n = 10) %>%
  select(created_at, screen_name, text, retweet_count)
print(blm_tweets_most_retweeted)
## # A tibble: 10 x 4
## created_at screen_name text retweet_count
## <dttm> <chr> <chr> <dbl>
## 1 2020-08-27 02:50:05 Blklivesmatt… "FUCK THIS MAN!!!! WE DEMAND… 264125
## 2 2020-10-11 01:40:00 Blklivesmatt… "A thread on what’s happenin… 51097
## 3 2020-05-03 17:42:05 Blklivesmatt… "*Blinks in BLM* https://t.… 48906
## 4 2021-01-07 12:40:38 Blklivesmatt… "They've killed us for less!" 43303
## 5 2020-06-09 00:10:55 Blklivesmatt… "3 million students attend s… 41545
## 6 2020-07-18 16:50:58 Blklivesmatt… "55 years ago today, we were… 40229
## 7 2020-12-24 00:46:49 Blklivesmatt… "Move, Mitch, get out the wa… 39516
## 8 2020-05-03 17:42:58 Blklivesmatt… "Think about how harshly #Bl… 39207
## 9 2020-06-14 16:39:14 Blklivesmatt… "A heartbreaker. \n\nNext we… 28458
## 10 2021-01-18 18:38:11 Blklivesmatt… "A thread of Dr. King in col… 28395
# Drop retweets. is_retweet is a logical column, so test it directly;
# the original `is_retweet == "FALSE"` only worked via implicit
# logical-to-character coercion.
blm_tweets_noretweets <- blm_tweets %>% filter(!is_retweet)
# Five screen names that posted the hashtag most often in this sample
# (slice_max keeps ties, so more than five rows may come back)
student_debt_tweets_frequentweeters <-
  student_debt_tweets %>%
  count(screen_name) %>%
  slice_max(n, n = 5)
# Hourly time series of hashtag volume, with the collection window
# (earliest to latest created_at) shown in the subtitle
ts_plot(student_debt_tweets, "hours") +
  labs(
    x = NULL,
    y = NULL,
    title = "Frequency of tweets with a #CancelStudentDebt hashtag",
    subtitle = paste0(
      format(min(student_debt_tweets$created_at), "%d %B %Y"),
      " to ",
      format(max(student_debt_tweets$created_at), "%d %B %Y")
    ),
    caption = "Data collected from Twitter's REST API via rtweet"
  ) +
  theme_minimal()
# Extract lat/longs
# Derive point coordinates (lat/lng columns) from each tweet's geo metadata
student_debt_tweets <- student_debt_tweets %>% lat_lng()
# Keep only geotagged tweets. Use !is.na() directly rather than the
# anti-idiom `is.na(x) == FALSE`.
student_debt_tweets_latlong_extract <- student_debt_tweets %>%
  filter(!is.na(lat) & !is.na(lng))
# Promote to an sf point layer; Twitter coordinates are WGS84 lon/lat,
# and st_as_sf() can set the CRS in the same call
student_debt_tweets_latlong_extract <- student_debt_tweets_latlong_extract %>%
  st_as_sf(coords = c("lng", "lat"), crs = "EPSG:4326")
# set tmap to interactive (leaflet-backed) view mode
tmap_mode("view")
## tmap mode set to interactive viewing
# map the geotagged tweets as dots
tm_shape(student_debt_tweets_latlong_extract) +
  tm_dots()
# Collapse all tweet text into one string. Separate tweets with a space:
# collapse = "" would fuse the last word of one tweet with the first word
# of the next, producing junk tokens in the word cloud.
blm_text <- str_c(blm_tweets$text, collapse = " ")
blm_text <-
  blm_text %>%
  str_remove_all("\\n") %>%              # ALL linebreaks (str_remove only hit the first)
  rm_twitter_url() %>%                   # remove t.co URLs
  rm_url() %>%                           # remove any remaining URLs
  str_remove_all("#\\S+") %>%            # remove hashtags
  str_remove_all("@\\S+") %>%            # remove @mentions
  removeWords(stopwords("english")) %>%  # remove common words (a, the, it etc.)
  removeNumbers() %>%
  stripWhitespace() %>%
  removeWords(c("amp"))                  # "&" arrives HTML-escaped as "&amp;"
# Word frequencies via a term-document matrix (terms are lowercased by tm)
textCorpus <-
  Corpus(VectorSource(blm_text)) %>%
  TermDocumentMatrix() %>%
  as.matrix()
textCorpus <- sort(rowSums(textCorpus), decreasing = TRUE)
textCorpus <- data.frame(word = names(textCorpus), freq = textCorpus, row.names = NULL)
# Capitalized "The" survives the lowercase stopword pass and is lowercased
# later by tm, so drop it here explicitly
textCorpus <- textCorpus %>% filter(word != "the")
wordcloud_blm <- wordcloud2(data = textCorpus, minRotation = 0, maxRotation = 0, ellipticity = 0.2)
wordcloud_blm
# You can write your word cloud out to disk as a PNG with the following:
# Render the widget to HTML, then screenshot it to PNG.
# install_phantomjs() downloads PhantomJS the first time only.
install_phantomjs()
# Spell out FALSE: T/F are ordinary (reassignable) variables in R
saveWidget(wordcloud_blm, "blm.html", selfcontained = FALSE)
webshot("blm.html", "blm.png", vwidth = 1000, vheight = 1000, delay = 10)
#' Build a word cloud from an account's recent tweets.
#'
#' @param twitterhandle Screen name to pull tweets from (e.g. "nytimes").
#' @param tweet_number  Number of recent tweets to request (API max 3200).
#' @return A wordcloud2 htmlwidget.
twitter_wordcloud <- function(twitterhandle, tweet_number) {
  tweet_timeline <- get_timeline(twitterhandle, n = tweet_number)
  # Join tweets with a space so words at tweet boundaries do not fuse
  tweet_timeline_text <- str_c(tweet_timeline$text, collapse = " ")
  tweet_timeline_text <- tweet_timeline_text %>%
    str_remove_all("\\n") %>%              # ALL linebreaks (str_remove only hit the first)
    rm_twitter_url() %>%                   # remove t.co URLs
    rm_url() %>%                           # remove any remaining URLs
    str_remove_all("#\\S+") %>%            # remove hashtags
    str_remove_all("@\\S+") %>%            # remove @mentions
    removeWords(stopwords("english")) %>%  # remove common words (a, the, it etc.)
    removeNumbers() %>%
    stripWhitespace() %>%
    removeWords(c("amp")) %>%              # "&" arrives HTML-escaped as "&amp;"
    removePunctuation() %>%
    # Whole-word removal only: the original '[Tt]he' pattern also mangled
    # words like "weather", "theory", "other"
    str_remove_all(pattern = "\\b[Tt]he\\b") %>%
    str_remove_all(pattern = "[:emoji:]")  # ICU emoji property class
  # Word frequencies via a term-document matrix (terms lowercased by tm)
  textCorpus <-
    Corpus(VectorSource(tweet_timeline_text)) %>%
    TermDocumentMatrix() %>%
    as.matrix()
  textCorpus <- sort(rowSums(textCorpus), decreasing = TRUE)
  textCorpus <- data.frame(word = names(textCorpus), freq = textCorpus, row.names = NULL)
  wordcloud <- wordcloud2(data = textCorpus, minRotation = 0, maxRotation = 0, ellipticity = 0.2)
  wordcloud
}
# Generate a word cloud for the past 400 NYT tweets and view it
nyt_wordcloud <- twitter_wordcloud("nytimes", 400)
print(nyt_wordcloud)
# Apply the twitter_wordcloud() function created above to multiple handles,
# generating one word cloud per outlet: the New York Times, Financial Times,
# Washington Post, Fox News, CNN, and the Denver Post.
# Name the handle vector with its own values: map2() only names its output
# list when .x itself is named, and the later wordcloud_list[["..."]]
# lookups return NULL against an unnamed list.
handles <- set_names(c("nytimes", "FinancialTimes", "FoxNews", "cnn",
                       "washingtonpost", "denverpost"))
number <- c(400)  # recycled across all handles by map2()
wordcloud_list <- map2(.x = handles, .y = number, .f = twitter_wordcloud)
# View the Washington Post word cloud by accessing it from the list:
# View the Washington Post word cloud. Index positionally via the handles
# vector: map2() returns an UNNAMED list when built from an unnamed vector,
# so wordcloud_list[["washingtonpost"]] would silently return NULL.
wordcloud_list[[match("washingtonpost", handles)]]
# View the Denver Post word cloud by accessing it from the list:
# View the Denver Post word cloud. Positional lookup via match() works
# whether or not the list carries names (name-based [["denverpost"]] would
# return NULL against the unnamed list map2() produces here).
wordcloud_list[[match("denverpost", handles)]]
# NOTE(review): setwd() in a script makes it non-portable -- prefer an
# RStudio project or here::here() for paths. Kept as-is for this workshop.
setwd("~/Documents/git_repositories/twitter_workshop")
# Embed the previously exported Denver Post PNG in the rendered document
knitr::include_graphics("images/denverpost.png")
# What would you type to extract the Financial Times word cloud from the list?
# Write a single word-cloud widget to disk as HTML plus a PNG screenshot.
#
# @param wordclouds_to_export A wordcloud2 htmlwidget.
# @param wordcloud_names      Base file name (no extension) for the output.
output_wordclouds <- function(wordclouds_to_export, wordcloud_names) {
  # setwd() returns the previous directory; restore it on exit so the
  # working-directory side effect does not leak out of this function
  old_wd <- setwd("/Users/adra7980/Documents/git_repositories/twitter_workshop/wordclouds")
  on.exit(setwd(old_wd), add = TRUE)
  install_phantomjs()  # no-op cost after the first install on a machine
  saveWidget(wordclouds_to_export,
             paste0(wordcloud_names, ".html"),
             selfcontained = FALSE)  # spell out FALSE, not F
  webshot(paste0(wordcloud_names, ".html"),
          paste0(wordcloud_names, ".png"),
          vwidth = 1992, vheight = 1744, delay = 10)
}
# Iteratively write every word cloud in the list to disk. walk2() (not
# map2()) because only the side effects matter, and the handles vector is
# passed directly for the file names: names(wordcloud_list) is NULL when
# the list was built from an unnamed handle vector, which would break the
# output file names.
walk2(.x = wordcloud_list, .y = handles, .f = output_wordclouds)